This data set contains 113,937 loans with 81 variables on each loan, including loan amount, borrower rate (or interest rate), current loan status, borrower income, and many others.
- This data dictionary explains the variables in the data set.
- You are not expected to explore all of the variables in the dataset! Focus your exploration on about 10-15 of them.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv("prosperLoanData.csv")
df.head()
# Exploring the columns
df.info()
81 columns and 113937 rows
the loan amounts and loan amount > 10000 the most convinient way to get it should have which factors.
Credit Score/grade , Prosper Score , borrower Rate , Occupation , etc
# A Count plot of Loan Status in genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(12 , reverse=True)
sns.countplot(x=df['LoanStatus'], ax=ax , order=df['LoanStatus'].value_counts().index , palette=cmap)
plt.xticks(rotation=15)
plt.title("Diffrent Loan Status Counts (in gen)")
for i, v in df['LoanStatus'].value_counts().reset_index().iterrows():
ax.text(i, v.LoanStatus + 0.2 , v.LoanStatus, color='red')
# A Count plot of Loan Status in elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(12 , reverse=True)
sns.countplot(x=df.query("LoanOriginalAmount > 10000")['LoanStatus'], ax=ax , order=df.query("LoanOriginalAmount > 10000")['LoanStatus'].value_counts().index , palette=cmap)
plt.xticks(rotation=15)
plt.title("Diffrent Loan Status Counts (in elite)")
for i, v in df.query("LoanOriginalAmount > 10000")['LoanStatus'].value_counts().reset_index().iterrows():
ax.text(i, v.LoanStatus + 0.2 , v.LoanStatus, color='red')
# Listing category numeric top 5 counts
df = df.rename(columns={"ListingCategory (numeric)":"ListingCategory"})
df['ListingCategory'].value_counts().head()
# A Count plot of Listing Category for genral loans
new = ["Not Available" , "Debt Consolidation", "Home Improvement" , "Business" , "Personal Loan" , "Student Use" , "Auto" , "Other" , "Baby&Adoption",
"Boat" , "Cosmetic Procedure" , "Engagement Ring" , "Green Loans" , "Household Expenses" , "Large Purchases" , "Medical/Dental" , "Motorcycle" ,
"RV" , "Taxes" , "Vacation" , "Wedding Loans"]
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(20 , reverse=True)
sns.countplot(x=df['ListingCategory'], ax=ax , order=df['ListingCategory'].value_counts().index , palette=cmap)
labels = [item.get_text() for item in ax.get_xticklabels()]
for i in range(21):
labels[i] = new[int(labels[i])]
ax.set_xticklabels(labels)
plt.xticks(rotation=25)
plt.title("Loans given out according to Category (in gen)")
for i, v in df['ListingCategory'].value_counts().reset_index().iterrows():
ax.text(i, v.ListingCategory + 0.2 , v.ListingCategory, color='red');
# A Count plot of Listing Category for Elite Loans
new = ["Not Available" , "Debt Consolidation", "Home Improvement" , "Business" , "Personal Loan" , "Student Use" , "Auto" , "Other" , "Baby&Adoption",
"Boat" , "Cosmetic Procedure" , "Engagement Ring" , "Green Loans" , "Household Expenses" , "Large Purchases" , "Medical/Dental" , "Motorcycle" ,
"RV" , "Taxes" , "Vacation" , "Wedding Loans"]
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(20 , reverse=True)
sns.countplot(x=df.query("LoanOriginalAmount > 10000")['ListingCategory'], ax=ax , order=df.query("LoanOriginalAmount > 10000")['ListingCategory'].value_counts().index , palette=cmap)
labels = [item.get_text() for item in ax.get_xticklabels()]
for i in range(21):
labels[i] = new[int(labels[i])]
ax.set_xticklabels(labels)
plt.xticks(rotation=25)
plt.title("Loans given out according to Category (in elite)")
for i, v in df.query("LoanOriginalAmount > 10000")['ListingCategory'].value_counts().reset_index().iterrows():
ax.text(i, v.ListingCategory + 0.2 , v.ListingCategory, color='red');
# Diffrent Occupations available
df["Occupation"].value_counts().iloc[0:20]
# A Count plot of Diffent occupations in genral
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(20 , reverse=True)
sns.countplot(x=df["Occupation"], ax=ax , order=df["Occupation"].value_counts().iloc[2:20].index , palette=cmap)
plt.xticks(rotation=45)
plt.title("Count of loans given out by Ocuupation (in gen)")
for i, v in df["Occupation"].value_counts().iloc[2:20].reset_index().iterrows():
ax.text(i, v.Occupation + 0.2 , v.Occupation, color='red');
# A Count plot of Diffent occupations in elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(20 , reverse=True)
sns.countplot(x=df.query("LoanOriginalAmount > 10000")["Occupation"], ax=ax , order=df.query("LoanOriginalAmount > 10000")["Occupation"].value_counts().iloc[2:20].index , palette=cmap)
plt.xticks(rotation=45)
plt.title("Count of loans given out by Ocuupation (in elite)")
for i, v in df.query("LoanOriginalAmount > 10000")["Occupation"].value_counts().iloc[2:20].reset_index().iterrows():
ax.text(i, v.Occupation + 0.2 , v.Occupation, color='red');
# diffent Credit Grades
df["CreditGrade"].value_counts()
# A Count plot of Diffent Credit Grades in genral
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(9 , reverse=True)
sns.countplot(x=df["CreditGrade"], ax=ax , order=df["CreditGrade"].value_counts().index , palette=cmap)
plt.title("Count of loans given out by Credit Grade (in gen)")
for i, v in df["CreditGrade"].value_counts().reset_index().iterrows():
ax.text(i, v.CreditGrade + 0.2 , v.CreditGrade, color='red');
# A Count plot of Diffent Credit Grades in elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(9 , reverse=True)
sns.countplot(x=df.query("LoanOriginalAmount > 10000")["CreditGrade"], ax=ax , order=df.query("LoanOriginalAmount > 10000")["CreditGrade"].value_counts().index , palette=cmap)
plt.title("Count of loans given out by Credit Grade (in elite)")
for i, v in df.query("LoanOriginalAmount > 10000")["CreditGrade"].value_counts().reset_index().iterrows():
ax.text(i, v.CreditGrade + 0.2 , v.CreditGrade, color='red');
# A Count plot of Diffent Employment Status in genral
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(9 , reverse=True)
sns.countplot(x=df["EmploymentStatus"], ax=ax , order=df["EmploymentStatus"].value_counts().index , palette=cmap)
plt.title("Count of loans given out by Employment Status (in gen)")
for i, v in df["EmploymentStatus"].value_counts().reset_index().iterrows():
ax.text(i, v.EmploymentStatus + 0.2 , v.EmploymentStatus, color='red');
# A Count plot of Diffent Employment Status in elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(9 , reverse=True)
sns.countplot(x=df.query("LoanOriginalAmount > 10000")["EmploymentStatus"], ax=ax , order=df.query("LoanOriginalAmount > 10000")["EmploymentStatus"].value_counts().index , palette=cmap)
plt.title("Count of loans given out by Employment Status (in elite)")
for i, v in df.query("LoanOriginalAmount > 10000")["EmploymentStatus"].value_counts().reset_index().iterrows():
ax.text(i, v.EmploymentStatus + 0.2 , v.EmploymentStatus, color='red');
# ahistogram depecting the relationship between diffrent ranges of loan amount and no of people who recieved it
fig, ax = plt.subplots(figsize=(20 , 10))
bins = np.arange(1000 , df['LoanOriginalAmount'].max()+1000 , 3000)
plt.hist(df['LoanOriginalAmount'] ,bins=bins , ec='blue' , alpha=0.6)
plt.xticks(bins , rotation=30)
plt.xlabel("loan amount")
plt.ylabel("no of people who recieved loan")
plt.title("Count of loans given out by Total Loan Amount");
# a histogram depicting the relationship between the ranges of prosper Scores and no of people having prosper
# score between that prosper score in genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
bins = np.arange(df['ProsperScore'].min() , df['ProsperScore'].max()+1 , 1)
plt.hist(df['ProsperScore'] ,bins=bins , ec='blue' , alpha=0.6)
plt.xticks(bins , rotation=30)
plt.xlabel("Prosper Scores")
plt.ylabel("no of people with that Prosper Score")
plt.title("Count of peoples with given Prosper Score (in gen)");
# a histogram depicting the relationship between the ranges of prosper Scores and no of people having prosper
# score between that prosper score in elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
bins = np.arange(df.query("LoanOriginalAmount > 10000")['ProsperScore'].min() , df.query("LoanOriginalAmount > 10000")['ProsperScore'].max()+1 , 1)
plt.hist(df.query("LoanOriginalAmount > 10000")['ProsperScore'] ,bins=bins , ec='green' ,color = 'green' , alpha=0.6)
plt.xticks(bins , rotation=30)
plt.xlabel("Prosper Scores")
plt.ylabel("no of people with that Prosper Score")
plt.title("Count of peoples with given Prosper Score (in elite)");
# A scatterplot depicting a relationship between BorrowerAPR and BorrowerRate in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.scatterplot(data=df , x="BorrowerAPR" , y="BorrowerRate")
plt.title("Borrower APR vs Borrower Rate (in gen)");
# A scatterplot depicting a relationship between BorrowerAPR and BorrowerRate in case of elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.scatterplot(data=df.query("LoanOriginalAmount > 10000") , x="BorrowerAPR" , y="BorrowerRate")
plt.title("Borrower APR vs Borrower Rate (in elite)");
# A violinplot depicting a relationship between Term and BorrowerRate in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.violinplot(data=df , x="Term" , y="BorrowerRate" , color='blue' );
plt.title("rerm vs BorrowerRate (in gen)");
# A violinplot depicting a relationship between Term and BorrowerRate in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.violinplot(data=df.query("LoanOriginalAmount > 10000") , x="Term" , y="BorrowerRate" , color='green' );
plt.title("rerm vs BorrowerRate (in eite)");
# A boxplot depicting a relationship between EmploymentStatus and LoanOriginalAmount in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = [[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.1750865648952205, 0.11840023306916837, 0.24215989137836502],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],]
sns.boxplot(data=df , x="EmploymentStatus" , y="LoanOriginalAmount" , palette=cmap)
plt.xticks(rotation=25)
plt.title("Employment Status vs Loan Original Amount (in gen)");
# A boxplot depicting a relationship between EmploymentStatus and LoanOriginalAmount in case of elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = [[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.1750865648952205, 0.11840023306916837, 0.24215989137836502],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],
[0.9312692223325372, 0.8201921796082118, 0.7971480974663592],]
sns.boxplot(data=df.query("LoanOriginalAmount > 10000") , x="EmploymentStatus" , y="LoanOriginalAmount" , palette=cmap)
plt.xticks(rotation=25)
plt.title("Employment Status vs Loan Original Amount (in elite)");
# A boxplot depicting a relationship between Prosper Score and LoanOriginalAmount in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(11 )
sns.boxplot(data=df , x="ProsperScore" , y="LoanOriginalAmount" , palette=cmap)
plt.xticks(rotation=25);
plt.title("prosper score vs loanAmount (in gen)");
# A boxplot depicting a relationship between Prosper Score and LoanOriginalAmount in case of elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(11 )
sns.boxplot(data=df.query("LoanOriginalAmount > 10000") , x="ProsperScore" , y="LoanOriginalAmount" , palette=cmap)
plt.xticks(rotation=25);
plt.title("prosper score vs loanAmount (in elite)");
# merging the CreditScoreRangeLower CreditScoreRangeUpper columns into one CreditScoreAvg average of both
col = df.loc[: , "CreditScoreRangeLower":"CreditScoreRangeUpper"]
df["CreditScoreAvg"] = col.mean(axis=1)
# Creatinhg a new column new_emp that cotains only 2 variables employed or not
df['new_emp'] = df["EmploymentStatus"]
df['new_emp'] = df['new_emp'].replace({"Self-employed": "Employed", "Full-time": "Employed", "Part-time": "Employed", "Retired": "Not employed"})
df["new_emp"].unique()
# a stripplot that depicts the relationship between CreditScoreAvg and LoanOriginalAmount with new_emp
# as the third variable in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(data=df , x="CreditScoreAvg" , y="LoanOriginalAmount" , hue = "new_emp");
plt.title("credit score vs loanAmount (in gen)");
# a stripplot that depicts the relationship between CreditScoreAvg and LoanOriginalAmount with new_emp
# as the third variable in case of elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(data=df.query("LoanOriginalAmount > 10000") , x="CreditScoreAvg" , y="LoanOriginalAmount" , hue = "new_emp" );
plt.title("credit score vs loanAmount wrt employ status (in elite)");
# a boxplot that depicts the relationship between Credit Grade and LoanOriginalAmount with new_emp
# as the third variable in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.boxplot(data=df.loc[df['new_emp'].isin(['Employed' , 'Not employed'])] , x="CreditGrade" , y="LoanOriginalAmount" , hue = "new_emp" );
plt.title("credit grade vs loanAmount wrt employed or not(in gen)");
# a boxplot that depicts the relationship between Credit Grade and LoanOriginalAmount with new_emp
# as the third variable in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.boxplot(data=df.query("LoanOriginalAmount > 10000").loc[df['new_emp'].isin(['Employed' , 'Not employed'])] , x="CreditGrade" , y="LoanOriginalAmount" , hue = "new_emp" );
plt.title("credit grade vs loanAmount wrt employed or not(in elite)");
# a stripplot that depicts the relationship between Credit Grade and LoanOriginalAmount with new_emp
# as the third variable in case of genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(data=df.loc[df['new_emp'].isin(['Employed' , 'Not employed'])] , x="CreditGrade" , y="LoanOriginalAmount" , hue = "new_emp" );
plt.title("credit grade vs loanAmount wrt employed or not(in gen)");
# a stripplot that depicts the relationship between Credit Grade and LoanOriginalAmount with new_emp
# as the third variable in case of elite loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(data=df.query("LoanOriginalAmount > 10000").loc[df['new_emp'].isin(['Employed' , 'Not employed'])] , x="CreditGrade" , y="LoanOriginalAmount" , hue = "new_emp" );
plt.title("credit grade vs loanAmount wrt employed or not(in elite)");
# a heatmap of all the columns with |correlation| > 0.5 to fing relations
fig, ax = plt.subplots(figsize=(20 , 20))
df_corr = df.corr()
df_corr[np.abs(df_corr)<.50] = 0
mask = np.zeros_like(df_corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df_corr , mask=mask , cmap='coolwarm' , linewidths=1)
plt.xlabel("columns")
plt.ylabel("columns")
plt.title("|correlation| > 0.5");
# from the above matrix heatmap we can derive a smaller heatmap
plt.figure(figsize=(15,15))
sns.heatmap(df[['BorrowerAPR', 'BorrowerRate', 'LenderYield', 'EstimatedEffectiveYield','EstimatedLoss' , 'EstimatedReturn' , 'ProsperRating (numeric)' , 'ProsperScore', 'CreditGrade', 'CreditScoreAvg', 'Term','ListingCategory']].corr() , annot=True);
plt.xlabel("columns")
plt.ylabel("columns")
plt.title("correlations");
# exploring the relationship between CreditScoreAvg ProsperScore using heatmap
sns.heatmap(df[['CreditScoreAvg' , 'ProsperScore']].corr() , annot=True)
plt.title("credit score and Prosper Score");
# exploring the relationship between term and Creditscoreavg using boxplot
fig, ax = plt.subplots(figsize=(20 , 10))
sns.boxplot(x ='Term', y ='CreditScoreAvg', data = df,
);
plt.title("credit score and Term");
# exploring the relationship between CreditScoreAvg and BorrowerAPR using stripplot
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(data=df , x="CreditScoreAvg" , y="BorrowerAPR" , jitter = True, dodge = True);
plt.title("credit score and BorrowerAPR");
# exploring the relationship between ProsperRating (numeric) and BorrowerAPR using stripplot
fig, ax = plt.subplots(figsize=(20 , 10))
sns.stripplot(x ='ProsperRating (numeric)', y ='BorrowerAPR', data = df,
jitter = True, dodge = True);
plt.title("Prosper Rating and BorrowerAPR");
# Plotting Correlationds using scattermatrix
df_scatter = df[['BorrowerAPR', 'BorrowerRate', 'LenderYield', 'EstimatedEffectiveYield','EstimatedLoss' , 'EstimatedReturn' , 'ProsperRating (numeric)' , 'ProsperScore', 'CreditGrade', 'CreditScoreAvg', 'Term','ListingCategory']]
sns.pairplot(df_scatter)
plt.xlabel("columns")
plt.ylabel("columns")
plt.title("correlations");
From the above visualizations we get to know
# exploring the relationship between BorrowerRate and LoanStatus using boxplot in genral loans
fig, ax = plt.subplots(figsize=(20 , 10))
sns.boxplot(x ='BorrowerRate', y ='LoanStatus', data = df)
plt.title("Loan STates and Borrower Rate (in gen)");
# exploring the relationship between BorrowerRate and LoanStatus using boxplot in elite loan
fig, ax = plt.subplots(figsize=(20 , 10))
sns.boxplot(x ='BorrowerRate', y ='LoanStatus', data = df.query("LoanOriginalAmount > 10000"))
plt.title("Loan STates and Borrower Rate (in elite)");
# exploring the relationship between ProsperRating and LoanStatus using heatmap in gen loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.pivot_table(index='LoanStatus',
columns='ProsperRating (numeric)',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap , annot=True)
plt.title("Loan STates and ProsperRating (in gen)")
# exploring the relationship between ProsperRating and ListingKey using heatmap in elite loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.query("LoanOriginalAmount > 10000").pivot_table(index='LoanStatus',
columns='ProsperRating (numeric)',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap , annot=True)
plt.title("Loan STates and ProsperRating (in elite)");
# exploring the relationship between LoanStatus and Term using heatmap in genral loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.pivot_table(index='LoanStatus',
columns='Term',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap , annot=True)
plt.title("Loan STates and Term (in gen)");
# exploring the relationship between LoanStatus and Term using heatmap in elite loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.query("LoanOriginalAmount > 10000").pivot_table(index='LoanStatus',
columns='Term',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap , annot=True)
plt.title("Loan STates and Term (in elite)");
# exploring the relationship between LoanStatus and CreditScoreAvg using heatmap in genral loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.pivot_table(index='LoanStatus',
columns='CreditScoreAvg',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap )
plt.title("Loan STates and CreditScoreAvg (in gen)");
# exploring the relationship between LoanStatus and CreditScoreAvg using heatmap in elite loan
fig, ax = plt.subplots(figsize=(20 , 10))
df_heatmap = df.query("LoanOriginalAmount > 10000").pivot_table(index='LoanStatus',
columns='CreditScoreAvg',
values='ListingKey',
aggfunc=lambda x: len(x.unique()))
sns.heatmap(df_heatmap )
plt.title("Loan STates and CreditScoreAvg (in gen)");
From the above visualizations we get to know-
# exploring the relationship between ListingCategory and LoanOriginalAmount using barplot in genral loan
new = ["Not Available" , "Debt Consolidation", "Home Improvement" , "Business" , "Personal Loan" , "Student Use" , "Auto" , "Other" , "Baby&Adoption",
"Boat" , "Cosmetic Procedure" , "Engagement Ring" , "Green Loans" , "Household Expenses" , "Large Purchases" , "Medical/Dental" , "Motorcycle" ,
"RV" , "Taxes" , "Vacation" , "Wedding Loans"]
fig, ax = plt.subplots(figsize=(20 , 10))
cmap = sns.cubehelix_palette(21 , reverse=True)
sns.barplot(data=df , x="ListingCategory" , y="LoanOriginalAmount" ,palette=cmap )
labels = [item.get_text() for item in ax.get_xticklabels()]
for i in range(21):
labels[i] = new[int(labels[i])]
ax.set_xticklabels(labels)
plt.xticks(rotation=25);
plt.title("Loan Categories and Loan amount");
- Having credit score >729.5 , grade AA , A , B, D , C and Prosper Score in between (6-9) and having a job of Executive Followed by computer programmer demanding a loan for Debt Consolidation will ensure higher chances to get an Elite Loan(loan>10000).
- Lender Yield is directly proportional to borrowAPR , Borrow Rate and Estimated Effective Yield